/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection

Created on: 1/28/2022
Last Modified on: 12/16/2023

Description: This program generates investigator-level placement and outcome
rates, as well as standard errors for these estimates, but in the 2017-2019 sample - 
to look at heterogeneity in investigator UD by reporting source.

Note that we have removed the file directory names from this program for 
confidentiality reasons.
********************************************************************************/

**************************
**(0) SETUP
**************************
* Start from a clean session: no data in memory, no pagination prompts,
* no leftover global macros, and no open log.
clear
set more off
* _all is the keyword that drops every global macro. The bare word all only
* targets a macro literally named all, which leaves stale globals in place.
macro drop _all
capture log close
* Fix the random-number seed so that any stochastic step is reproducible
set seed 02042023

*Set directories 
* The paths were removed for confidentiality. Each one is concatenated directly
* with a file name further down, so it must end with a path separator.
global cleandata 
global tmpdata 
global rawdata
global output

*************************
**(1) GENERATE INVESTIGATORS' PLACEMENT AND MALTREATMENT RATES, AMONG CALLS INITIATED BY MANDATED REPORTERS
*************************
**Start with the universe of screened-in hotline calls in the analysis sample (after making the main sample restrictions)
* clear is the documented option of use for discarding the data currently in memory
use "${tmpdata}all_hotline_calls_main_restrictions_qje.dta", clear 

*Keep only calls from mandated reporters (any of the edu, medical, counselor, or law flags equals 1)
gen mandated = edu==1 | medical==1 | counselor==1 | law==1 
keep if mandated==1
keep if screened==1

*Keep only investigators who handled at least 200 cases 
* The caseload is counted here, before the restrictions below remove further cases
cap drop n 
bysort worker_id: gen n = _N 
drop if n<200

**Additional sample restrictions needed for the analysis (sexual abuse cases are not randomly assigned)
drop if sexab==1 

**Zip code is a key variable in the analysis 
* Recode the literal text NA to the missing marker so destring can convert the
* variable to numeric, then drop cases without a zip code
replace zipcode_vic="." if zipcode_vic=="NA"
destring zipcode_vic, replace 
drop if zipcode_vic==. 

**Keep only investigations that are not within 365 days for the same child 
* diff is the gap to the same child's previous investigation. It is missing for
* a child's first investigation, which is therefore always kept.
sort childpartyid cw_date_stata intake_id, stable
by childpartyid: gen diff = cw_date_stata[_n] - cw_date_stata[_n-1]
order diff 
drop if diff<365 & diff!=.

*Generate rotation fixed effects
* One group per combination of cps_year and zip code
egen rotation = group(cps_year zipcode_vic)

**Globals for variable lists
global fixedeffect = "rotation"

**Keep only variables of interest 
* (worker_id was listed twice in the original varlists; once is enough)
keep worker_id black white fc inv6m $fixedeffect childpartyid
sum worker_id black white fc inv6m $fixedeffect childpartyid

* Recode investigators to consecutive integers for use as a factor variable and
* store the list of codes in the local levels, which the estimation loop reads.
* The local() option fills the macro directly instead of routing the list
* through a string expression.
egen grpworker_id = group(worker_id)
levelsof grpworker_id, local(levels)

**Generate investigator placement, subsequent maltreatment rates, and standard errors 
* One pass per outcome: fc (placement) first, then inv6m
foreach var in fc inv6m {
	* Estimation sample: every case for fc. Any other outcome is selectively
	* observed, so it is estimated only on cases with fc equal to 0.
	local smp ""
	if "`var'"!="fc" local smp "if fc == 0"
	
	* Pooled regression on investigator dummies, absorbing the fixed effect
	quietly reghdfe `var' i.grpworker_id `smp', absorb($fixedeffect) resid
	
	* Average absorbed fixed effect: mean gap between the full prediction (xbd)
	* and the prediction from the investigator dummies alone (xb)
	predict fit_all, xbd
	predict fit_inv, xb
	gen fe_gap = fit_all - fit_inv
	sum fe_gap
	local fe_mean = r(mean)
	
	gen base_`var' = .
	gen se_`var' = .
	
	* Investigator-specific rate and standard error. When lincom fails for an
	* investigator, that investigator's values stay missing.
	qui {
		foreach i of local levels {
			capture lincom _cons + _b[`i'.grpworker_id] + `fe_mean'
			if !_rc {
				replace base_`var' = r(estimate) if grpworker_id == `i'
				replace se_`var' = r(se) if grpworker_id == `i'
			}
		}
	}
	drop fit_all fit_inv fe_gap
	
	* By race (black and white): the outcome is defined only for children flagged
	* black or white, and investigator dummies are interacted with black.
	* The sample condition is the same as in the pooled regression.
	qui {
		gen y_bw = `var' if black == 1 | white == 1
		reghdfe y_bw i.grpworker_id i.grpworker_id#i.black `smp', absorb($fixedeffect) resid
	}
	
	* Average absorbed fixed effect for the by-race regression
	predict fit_all, xbd
	predict fit_inv, xb
	gen fe_gap = fit_all - fit_inv
	sum fe_gap
	local fe_mean = r(mean)
	
	gen b_`var' = .
	gen b_se_`var' = .
	gen w_`var' = .
	gen w_se_`var' = .
	
	* Quietly fill the race-specific rates, one investigator at a time
	qui {
		foreach i of local levels {
			* White children: constant + investigator effect + mean fixed effect
			capture lincom _cons + _b[`i'.grpworker_id] + `fe_mean'
			if !_rc {
				replace w_`var' = r(estimate) if grpworker_id == `i'
				replace w_se_`var' = r(se) if grpworker_id == `i'
			}
			* Black children: the same sum plus the investigator-by-black interaction
			capture lincom _cons + _b[`i'.grpworker_id] + _b[`i'.grpworker_id#1.black] + `fe_mean'
			if !_rc {
				replace b_`var' = r(estimate) if grpworker_id == `i'
				replace b_se_`var' = r(se) if grpworker_id == `i'
			}
		}
	}
	drop y_bw fit_all fit_inv fe_gap
}


*----------------------------------------------------------------------*
* Reduce to one row per investigator and save 
*----------------------------------------------------------------------*
* Per-investigator counts: number of cases, then the sums of the black and white indicators
bysort worker_id: gen count_inv = _N
foreach grp in black white {
	bysort worker_id: egen count_`grp' = total(`grp')
}

* Retain the estimates, their standard errors, the counts, and both investigator identifiers
keep worker_id grpworker_id count_inv count_black count_white base_* se_* b_* w_*

* Every retained variable is constant within worker_id, so a single row per investigator is kept
duplicates drop worker_id, force
drop if worker_id == .

save "${tmpdata}inv_rates_later_sample_m_qje.dta", replace


*************************
**(2) GENERATE INVESTIGATORS' PLACEMENT AND MALTREATMENT RATES, AMONG CALLS INITIATED BY NON-MANDATED REPORTERS
*************************
**Start with the universe of screened-in hotline calls in the analysis sample (after making the main sample restrictions)
* clear is the documented option of use for discarding the data currently in memory
use "${tmpdata}all_hotline_calls_main_restrictions_qje.dta", clear 

*Keep only calls from non-mandated reporters (none of the edu, medical, law, or counselor flags equals 1)
* NOTE(review): the mandated-reporter sample also applies keep if screened==1.
* It is not applied here; confirm the input file already contains only screened-in calls.
gen mandated = edu==1 | medical==1 | law==1 | counselor==1
keep if mandated==0

*Keep only investigators who handled at least 200 cases 
* The caseload is counted here, before the restrictions below remove further cases
cap drop n 
bysort worker_id: gen n = _N 
drop if n<200

**Additional sample restrictions needed for the analysis (sexual abuse cases are not randomly assigned)
drop if sexab==1 

**Zip code is a key variable in the analysis 
* Recode the literal text NA to the missing marker so destring can convert the
* variable to numeric, then drop cases without a zip code
replace zipcode_vic="." if zipcode_vic=="NA"
destring zipcode_vic, replace 
drop if zipcode_vic==. 

**Keep only investigations that are not within 365 days for the same child 
* diff is the gap to the same child's previous investigation. It is missing for
* a child's first investigation, which is therefore always kept.
sort childpartyid cw_date_stata intake_id, stable
by childpartyid: gen diff = cw_date_stata[_n] - cw_date_stata[_n-1]
order diff 
drop if diff<365 & diff!=.

*Generate rotation fixed effects
* One group per combination of cps_year and zip code
egen rotation = group(cps_year zipcode_vic)

**Globals for variable lists
global fixedeffect = "rotation"

**Keep only variables of interest 
* (worker_id was listed twice in the original varlists; once is enough)
keep worker_id black white fc inv6m $fixedeffect childpartyid
sum worker_id black white fc inv6m $fixedeffect childpartyid

* Recode investigators to consecutive integers for use as a factor variable and
* store the list of codes in the local levels, which the estimation loop reads.
* The local() option fills the macro directly instead of routing the list
* through a string expression.
egen grpworker_id = group(worker_id)
levelsof grpworker_id, local(levels)

**Generate investigator placement and subsequent maltreatment rates 
* One pass per outcome: fc (placement) first, then inv6m
foreach var in fc inv6m {
	* Estimation sample: every case for fc. Any other outcome is selectively
	* observed, so it is estimated only on cases with fc equal to 0.
	local smp ""
	if "`var'"!="fc" local smp "if fc == 0"
	
	* Pooled regression on investigator dummies, absorbing the fixed effect
	quietly reghdfe `var' i.grpworker_id `smp', absorb($fixedeffect) resid
	
	* Average absorbed fixed effect: mean gap between the full prediction (xbd)
	* and the prediction from the investigator dummies alone (xb)
	predict fit_all, xbd
	predict fit_inv, xb
	gen fe_gap = fit_all - fit_inv
	sum fe_gap
	local fe_mean = r(mean)
	
	gen base_`var' = .
	gen se_`var' = .
	
	* Investigator-specific rate and standard error. When lincom fails for an
	* investigator, that investigator's values stay missing.
	qui {
		foreach i of local levels {
			capture lincom _cons + _b[`i'.grpworker_id] + `fe_mean'
			if !_rc {
				replace base_`var' = r(estimate) if grpworker_id == `i'
				replace se_`var' = r(se) if grpworker_id == `i'
			}
		}
	}
	drop fit_all fit_inv fe_gap
	
	* By race (black and white): the outcome is defined only for children flagged
	* black or white, and investigator dummies are interacted with black.
	* The sample condition is the same as in the pooled regression.
	qui {
		gen y_bw = `var' if black == 1 | white == 1
		reghdfe y_bw i.grpworker_id i.grpworker_id#i.black `smp', absorb($fixedeffect) resid
	}
	
	* Average absorbed fixed effect for the by-race regression
	predict fit_all, xbd
	predict fit_inv, xb
	gen fe_gap = fit_all - fit_inv
	sum fe_gap
	local fe_mean = r(mean)
	
	gen b_`var' = .
	gen b_se_`var' = .
	gen w_`var' = .
	gen w_se_`var' = .
	
	* Quietly fill the race-specific rates, one investigator at a time
	qui {
		foreach i of local levels {
			* White children: constant + investigator effect + mean fixed effect
			capture lincom _cons + _b[`i'.grpworker_id] + `fe_mean'
			if !_rc {
				replace w_`var' = r(estimate) if grpworker_id == `i'
				replace w_se_`var' = r(se) if grpworker_id == `i'
			}
			* Black children: the same sum plus the investigator-by-black interaction
			capture lincom _cons + _b[`i'.grpworker_id] + _b[`i'.grpworker_id#1.black] + `fe_mean'
			if !_rc {
				replace b_`var' = r(estimate) if grpworker_id == `i'
				replace b_se_`var' = r(se) if grpworker_id == `i'
			}
		}
	}
	drop y_bw fit_all fit_inv fe_gap
}


*----------------------------------------------------------------------*
* Reduce to one row per investigator and save 
*----------------------------------------------------------------------*
* Per-investigator counts: number of cases, then the sums of the black and white indicators
bysort worker_id: gen count_inv = _N
foreach grp in black white {
	bysort worker_id: egen count_`grp' = total(`grp')
}

* Retain the estimates, their standard errors, the counts, and both investigator identifiers
keep worker_id grpworker_id count_inv count_black count_white base_* se_* b_* w_*

* Every retained variable is constant within worker_id, so a single row per investigator is kept
duplicates drop worker_id, force
drop if worker_id == .

save "${tmpdata}inv_rates_later_sample_n_qje.dta", replace


*************************
**(3) GENERATE CLUSTERED STANDARD ERRORS FOR INVESTIGATORS' PLACEMENT AND MALTREATMENT RATES, AMONG CALLS INITIATED BY MANDATED REPORTERS
*************************
**Start with the universe of screened-in hotline calls in the analysis sample (after making the main sample restrictions)
* clear is the documented option of use for discarding the data currently in memory
use "${tmpdata}all_hotline_calls_main_restrictions_qje.dta", clear 

*Keep only calls from mandated reporters (any of the edu, medical, counselor, or law flags equals 1)
gen mandated = edu==1 | medical==1 | counselor==1 | law==1 
keep if mandated==1
keep if screened==1

*Keep only investigators who handled at least 200 cases 
* The caseload is counted here, before the restrictions below remove further cases
cap drop n 
bysort worker_id: gen n = _N 
drop if n<200

**Additional sample restrictions needed for the analysis (sexual abuse cases are not randomly assigned)
drop if sexab==1 

**Zip code is a key variable in the analysis 
* Recode the literal text NA to the missing marker so destring can convert the
* variable to numeric, then drop cases without a zip code
replace zipcode_vic="." if zipcode_vic=="NA"
destring zipcode_vic, replace 
drop if zipcode_vic==. 

**Keep only investigations that are not within 365 days for the same child 
* diff is the gap to the same child's previous investigation. It is missing for
* a child's first investigation, which is therefore always kept.
sort childpartyid cw_date_stata intake_id, stable
by childpartyid: gen diff = cw_date_stata[_n] - cw_date_stata[_n-1]
order diff 
drop if diff<365 & diff!=.

*Generate rotation fixed effects
* One group per combination of cps_year and zip code
egen rotation = group(cps_year zipcode_vic)

**Globals for variable lists
global fixedeffect = "rotation"

**Keep only variables of interest 
* (worker_id was listed twice in the original varlists; once is enough)
keep worker_id black white fc inv6m $fixedeffect childpartyid
sum worker_id black white fc inv6m $fixedeffect childpartyid

* Recode investigators to consecutive integers for use as a factor variable and
* store the list of codes in the local levels, which the estimation loop reads.
* The local() option fills the macro directly instead of routing the list
* through a string expression.
egen grpworker_id = group(worker_id)
levelsof grpworker_id, local(levels)

**Generate investigator placement and subsequent maltreatment rates, with standard errors clustered by worker_id and childpartyid 
* Loop over all the outcomes 
foreach var in fc inv6m {
	* Use full sample for placement rates 
	if "`var'"=="fc"{
		qui: reghdfe `var' i.grpworker_id, resid absorb($fixedeffect) cluster(worker_id childpartyid)
	}
	
	* For selectively observed vars, condition on not being placed  
	if "`var'"!="fc"{
		qui: reghdfe `var' i.grpworker_id if fc == 0, resid absorb($fixedeffect) cluster(worker_id childpartyid)
	}
	
	* retrieve average of fixed effect
	* (mean gap between the full prediction xbd and the prediction xb from the investigator dummies alone)
	predict xbd, xbd
	predict xb, xb
	gen d = xbd - xb 
	sum d 
	local exp = r(mean)
	
	gen base_`var' = .
	gen twse_`var' = .
	
	* lincom returns its standard error in r(se). There is no r(twse) result, so
	* reading it filled every twse variable with missing values. The clustering
	* comes from the regression above, and r(se) already reflects it.
	qui{
	foreach i in `levels' {
		capture: lincom _cons + _b[`i'.grpworker_id] + `exp'
		
		* When lincom fails for an investigator, that investigator's values stay missing
		if _rc == 0 {
			replace base_`var' =  r(estimate) if grpworker_id == `i'
			replace twse_`var'= r(se) if  grpworker_id==`i'
		}
	}
	}
	drop xbd xb d
	* By race (black and white)
	* For fc placement outcome, use the entire sample
	qui {
	if "`var'"=="fc"{
		gen x = `var' if black == 1 | white == 1
		reghdfe x i.grpworker_id i.grpworker_id#i.black, absorb($fixedeffect) resid cluster(worker_id childpartyid)
	}
	
	* For other outcomes, condition on not being placed 
	if "`var'"!="fc"{
		gen x = `var' if black == 1 | white == 1
		reghdfe x i.grpworker_id i.grpworker_id#i.black if fc==0, resid absorb($fixedeffect) cluster(worker_id childpartyid)
	}
	}
	
	* retrieve average of fixed effect
	predict xbd, xbd
	predict xb, xb
	gen d = xbd - xb 
	sum d 
	local exp = r(mean)
	
	gen b_`var' = .
	gen b_twse_`var' = .
	gen w_`var' = .
	gen w_twse_`var' = .
   
   	* Quietly
	qui {
	* White children: constant + investigator effect + mean fixed effect
	foreach i in `levels'{
		capture: lincom _cons + _b[`i'.grpworker_id] + `exp'
		if _rc == 0 {
			replace w_`var' =  r(estimate) if grpworker_id == `i'
			replace w_twse_`var'= r(se) if  grpworker_id==`i'
		}
	}
	* Black children: the same sum plus the investigator-by-black interaction
	foreach i in `levels'{
		capture: lincom _cons + _b[`i'.grpworker_id] + _b[`i'.grpworker_id#1.black] + `exp'
		if _rc == 0 {
			replace b_`var' =  r(estimate) if grpworker_id == `i'
			replace b_twse_`var'= r(se) if  grpworker_id==`i'
		}
	}
	}
	drop x xbd xb d
}


*----------------------------------------------------------------------*
* Reduce to one row per investigator and save 
*----------------------------------------------------------------------*
* Per-investigator counts: number of cases, then the sums of the black and white indicators
bysort worker_id: gen count_inv = _N
foreach grp in black white {
	bysort worker_id: egen count_`grp' = total(`grp')
}

* Retain the estimates, the clustered standard errors, the counts, and both investigator identifiers
keep worker_id grpworker_id count_inv count_black count_white base_* twse_* b_* w_*

* Every retained variable is constant within worker_id, so a single row per investigator is kept
duplicates drop worker_id, force
drop if worker_id == .

save "${tmpdata}inv_twse_later_sample_m_qje.dta", replace


*************************
**(4) GENERATE CLUSTERED STANDARD ERRORS FOR INVESTIGATORS' PLACEMENT AND MALTREATMENT RATES, AMONG CALLS INITIATED BY NON-MANDATED REPORTERS
*************************
**Start with the universe of screened-in hotline calls in the analysis sample (after making the main sample restrictions)
* clear is the documented option of use for discarding the data currently in memory
use "${tmpdata}all_hotline_calls_main_restrictions_qje.dta", clear 

*Keep only calls from non-mandated reporters (none of the edu, medical, law, or counselor flags equals 1)
* NOTE(review): the mandated-reporter sample also applies keep if screened==1.
* It is not applied here; confirm the input file already contains only screened-in calls.
gen mandated = edu==1 | medical==1 | law==1 | counselor==1
keep if mandated==0

*Keep only investigators who handled at least 200 cases 
* The caseload is counted here, before the restrictions below remove further cases
cap drop n 
bysort worker_id: gen n = _N 
drop if n<200

**Additional sample restrictions needed for the analysis (sexual abuse cases are not randomly assigned)
drop if sexab==1 

**Zip code is a key variable in the analysis 
* Recode the literal text NA to the missing marker so destring can convert the
* variable to numeric, then drop cases without a zip code
replace zipcode_vic="." if zipcode_vic=="NA"
destring zipcode_vic, replace 
drop if zipcode_vic==. 

**Keep only investigations that are not within 365 days for the same child 
* diff is the gap to the same child's previous investigation. It is missing for
* a child's first investigation, which is therefore always kept.
sort childpartyid cw_date_stata intake_id, stable
by childpartyid: gen diff = cw_date_stata[_n] - cw_date_stata[_n-1]
order diff 
drop if diff<365 & diff!=.

*Generate rotation fixed effects
* One group per combination of cps_year and zip code
egen rotation = group(cps_year zipcode_vic)

**Globals for variable lists
global fixedeffect = "rotation"

**Keep only variables of interest 
* (worker_id was listed twice in the original varlists; once is enough)
keep worker_id black white fc inv6m $fixedeffect childpartyid
sum worker_id black white fc inv6m $fixedeffect childpartyid

* Recode investigators to consecutive integers for use as a factor variable and
* store the list of codes in the local levels, which the estimation loop reads.
* The local() option fills the macro directly instead of routing the list
* through a string expression.
egen grpworker_id = group(worker_id)
levelsof grpworker_id, local(levels)

**Generate investigator placement and subsequent maltreatment rates, with standard errors clustered by worker_id and childpartyid 
* Loop over all the outcomes 
foreach var in fc inv6m {
	* Use full sample for placement rates 
	if "`var'"=="fc"{
		qui: reghdfe `var' i.grpworker_id, resid absorb($fixedeffect) cluster(worker_id childpartyid)
	}
	
	* For selectively observed vars, condition on not being placed  
	if "`var'"!="fc"{
		qui: reghdfe `var' i.grpworker_id if fc == 0, resid absorb($fixedeffect) cluster(worker_id childpartyid)
	}
	
	* retrieve average of fixed effect
	* (mean gap between the full prediction xbd and the prediction xb from the investigator dummies alone)
	predict xbd, xbd
	predict xb, xb
	gen d = xbd - xb 
	sum d 
	local exp = r(mean)
	
	gen base_`var' = .
	gen twse_`var' = .
	
	* lincom returns its standard error in r(se). There is no r(twse) result, so
	* reading it filled every twse variable with missing values. The clustering
	* comes from the regression above, and r(se) already reflects it.
	qui{
	foreach i in `levels' {
		capture: lincom _cons + _b[`i'.grpworker_id] + `exp'
		
		* When lincom fails for an investigator, that investigator's values stay missing
		if _rc == 0 {
			replace base_`var' =  r(estimate) if grpworker_id == `i'
			replace twse_`var'= r(se) if  grpworker_id==`i'
		}
	}
	}
	drop xbd xb d
	* By race (black and white)
	* For fc placement outcome, use the entire sample
	qui {
	if "`var'"=="fc"{
		gen x = `var' if black == 1 | white == 1
		reghdfe x i.grpworker_id i.grpworker_id#i.black, absorb($fixedeffect) resid cluster(worker_id childpartyid)
	}
	
	* For other outcomes, condition on not being placed 
	if "`var'"!="fc"{
		gen x = `var' if black == 1 | white == 1
		reghdfe x i.grpworker_id i.grpworker_id#i.black if fc==0, resid absorb($fixedeffect) cluster(worker_id childpartyid)
	}
	}
	
	* retrieve average of fixed effect
	predict xbd, xbd
	predict xb, xb
	gen d = xbd - xb 
	sum d 
	local exp = r(mean)
	
	gen b_`var' = .
	gen b_twse_`var' = .
	gen w_`var' = .
	gen w_twse_`var' = .
   
   	* Quietly
	qui {
	* White children: constant + investigator effect + mean fixed effect
	foreach i in `levels'{
		capture: lincom _cons + _b[`i'.grpworker_id] + `exp'
		if _rc == 0 {
			replace w_`var' =  r(estimate) if grpworker_id == `i'
			replace w_twse_`var'= r(se) if  grpworker_id==`i'
		}
	}
	* Black children: the same sum plus the investigator-by-black interaction
	foreach i in `levels'{
		capture: lincom _cons + _b[`i'.grpworker_id] + _b[`i'.grpworker_id#1.black] + `exp'
		if _rc == 0 {
			replace b_`var' =  r(estimate) if grpworker_id == `i'
			replace b_twse_`var'= r(se) if  grpworker_id==`i'
		}
	}
	}
	drop x xbd xb d
}


*----------------------------------------------------------------------*
* Reduce to one row per investigator and save 
*----------------------------------------------------------------------*
* Per-investigator counts: number of cases, then the sums of the black and white indicators
bysort worker_id: gen count_inv = _N
foreach grp in black white {
	bysort worker_id: egen count_`grp' = total(`grp')
}

* Retain the estimates, the clustered standard errors, the counts, and both investigator identifiers
keep worker_id grpworker_id count_inv count_black count_white base_* twse_* b_* w_*

* Every retained variable is constant within worker_id, so a single row per investigator is kept
duplicates drop worker_id, force
drop if worker_id == .

save "${tmpdata}inv_twse_later_sample_n_qje.dta", replace



*************************
**(5) GENERATE FINAL DATASETS
*************************
* Build one final investigator-level file per reporting source, using the same
* steps for both: m = mandated reporters, n = non-mandated reporters
foreach src in m n {
	* Start from the rates file and attach the race-specific clustered standard
	* errors, keeping matched investigators only. The pooled twse variables
	* are not brought over.
	use "${tmpdata}inv_rates_later_sample_`src'_qje.dta", clear
	merge 1:1 worker_id using "${tmpdata}inv_twse_later_sample_`src'_qje.dta", keep(3) keepus(b_twse* w_twse*)

	*Gen additional variables needed for the main analysis 
	* share_black is the investigator's own share; bshare is its caseload-weighted mean
	gen share_black = count_black/count_inv 
	egen bshare = wtmean(share_black), weight(count_inv)
	
	* Turn the fc placement rates into their complements
	foreach x in w_fc b_fc {
		replace `x' = 1 - `x'
	}
	
	* Clamp the rates to the unit interval. Stata treats missing as larger than
	* any number, so without the guard an investigator with no estimate would
	* be assigned a rate of exactly 1 instead of staying missing.
	foreach x in w_fc b_fc w_inv6m b_inv6m {
		replace `x'=0 if `x'<0 
		replace `x'=1 if `x'>1 & !missing(`x')
	}
		
	rename (w_fc b_fc w_inv6m b_inv6m) (D_w D_b Y_w Y_b)
	
	* Squared terms
	gen D_w2 = D_w*D_w 
	gen D_b2 = D_b*D_b

	save "${cleandata}inv_later_sample_`src'_qje.dta", replace
}